#Snapshot of the objects present before this script runs; the cleanup step at
#the end of the script refers to `keep` when deciding what to remove.
keep = ls()

######################
#Get person locations#
######################

#Soldier-to-county crosswalks.
#The Iowa file is read with all of its columns; the other two files are
#reduced to (state, county, personpk) built from their 1860 fields.
#NOTE(review): the rbindlist below therefore assumes the Iowa file already has
#exactly these three columns -- confirm against the raw file.
ia_pk = fread('./raw/ia_soldier_xwalk.csv')

wi_pk = fread('./raw/wi_soldier_xwalk.csv')[
  , list(state = state_1860, county = county_1860, personpk)]

other_pk = fread('./raw/ct_il_ma_me_vt_soldier_xwalk.csv')[
  , list(state = state_1860, county = county_1860, personpk)]

#Drop soldiers from the multi-state file that the Iowa or Wisconsin
#crosswalks already cover
already_located = c(ia_pk$personpk, wi_pk$personpk)
other_pk = other_pk[!(personpk %in% already_located)]

#Stack the three crosswalks and discard rows without a person key
pk = rbindlist(list(ia_pk, wi_pk, other_pk))
pk = pk[!is.na(personpk)]

#Clean state names
#Two-letter codes are expanded to full names through a lookup vector; any
#value that is not a listed code is left untouched. Both state and county are
#upper-cased afterwards, so the mixed case of the replacement text is
#irrelevant to the final result.
state_lookup = c(
  IA = "IOWA",
  AL = "ALABAMA",
  AR = "ARKANSAS",
  CO = "COLORADO",
  CT = "Connecticut",
  DT = "Dakota",
  GA = "Georgia",
  MA = "Massachusetts",
  MD = "Maryland",
  ME = "Maine",
  MI = "Michigan",
  MN = "Minnesota",
  MO = "Missouri",
  MS = "Mississippi",
  NE = "Nebraska",
  NH = "New Hampshire",
  NJ = "New Jersey",
  NS = "Nova Scotia",
  NY = "New York",
  OH = "Ohio",
  PA = "Pennsylvania",
  TN = "Tennessee",
  TX = "Texas",
  VA = "Virginia",
  VT = "Vermont",
  WI = "Wisconsin",
  IL = "ILLINOIS",
  IN = "INDIANA",
  KS = "KANSAS",
  KY = "KENTUCKY",
  LA = "LOUISIANA"
)

pk[state %in% names(state_lookup), state := unname(state_lookup[state])]

pk[, state := toupper(state)]
pk[, county := toupper(county)]

#####################
#Get state-regiments#
#####################

#Union regiments only
regiments = fread(paste0(cwdb_path, '/regiments.csv'))[allegiance %in% 'Union']

#State codes whose regiments and soldiers are used by this script
union_states = c("IA", "WI", 'IL', 'CT', 'MA', "VT", "ME")

#Named list: one vector of regiment keys per state code
state_regiments = lapply(union_states, function(s) regiments[state %in% s, regimentpk])
names(state_regiments) = union_states


##################
#Get unit rosters#
##################

#Roster rows link a person to a regiment, with muster-in / exit information
roster = fread(
  paste0(cwdb_path,'/regiment_roster_table.csv'),
  select = c('regimentpk', 'personpk', 'mustindate', 'outdate', 'outmethodsimple', 'unitcompany')
)

#Named list: for each state code, the person keys of everyone who appears on
#the roster of one of that state's regiments
state_rosters = lapply(
  state_regiments[union_states],
  function(pks) roster[regimentpk %in% pks, personpk]
)

##############
#Get Soldiers#
##############
#Columns to load from the person table.
#This vector has its own name on purpose: the script starts with
#`keep = ls()` and finishes with `rm(list = setdiff(ls(), c(keep, 'keep')))`.
#Re-using `keep` here replaced that snapshot with column names, so the final
#cleanup deleted objects that existed before the script ran (cwdb_path etc.).
person_cols = c('lastname', 'firstname', 'midname', 'residence', 'state', 'enlistage', 'enlistdate', 'enlistdateest', 'personpk', 'allegiance', 'survivedwar', 'stateserved', 'enlistplace', 'birthyear', 'waspow', 'waswounded', 'mergerank', 'cleanrank', 'broadrank', 'overall_rank', 'overall_broadrank')
persons = fread(paste0(cwdb_path, '/person_table.csv'), select = person_cols)

#Keyed lookup keeps only Union soldiers; nomatch = 0L drops the lookup row
#itself if no soldier has that allegiance
setkey(persons, allegiance)
persons = persons[J("Union"), nomatch = 0L]

#Keep soldiers by state
#A soldier belongs to state s when any of the following holds:
#  - the person-table `state` or `stateserved` equals s,
#  - `enlistplace` ends with the state code,
#  - the soldier appears on the roster of one of the state's regiments.
#The result is a named list with one data.table of person fields per state.
state_persons = vector(mode = 'list', length = length(union_states))
names(state_persons) = union_states

for (s in union_states){

  state_persons_temp = persons[state %in% s | stateserved %in% s | grepl(paste0(s,"$"), enlistplace), personpk]

  state_persons_pk = unique(c(state_persons_temp, state_rosters[[s]]))

  #enlistage used to be listed twice here, which produced two columns with
  #the same name; each field is now selected exactly once
  state_persons[[s]] = persons[personpk %in% state_persons_pk, list(personpk,
                                                                    lastname,
                                                                    firstname,
                                                                    midname,
                                                                    residence,
                                                                    state,
                                                                    enlistage,
                                                                    birthyear,
                                                                    enlistdate,
                                                                    enlistplace,
                                                                    stateserved,
                                                                    survivedwar,
                                                                    waspow,
                                                                    waswounded,
                                                                    enlistdateest,
                                                                    allegiance,
                                                                    cleanrank)]

}

#The full person table is no longer needed
rm(persons); gc()

#Get birth year
#birth_year_full is the recorded birth year when present, otherwise the
#enlistment year minus the age at enlistment (when both are known).
for (s in union_states){
  temp = state_persons[[s]]
  temp[, enlist_year := year(strptime(enlistdate, format = "%Y-%m-%d"))]
  temp[!is.na(birthyear), birth_year_full := birthyear]
  temp[is.na(birthyear) & !is.na(enlistage) & !is.na(enlist_year), birth_year_full := enlist_year - enlistage]
  state_persons[[s]] = temp
}

#Stack the per-state tables. A soldier can qualify for several states (e.g.
#resident of one state, served in a regiment of another) and then sits in
#several list elements with identical person fields. Keeping one row per
#personpk prevents duplicated rows in the later personpk joins and inflated
#county veteran counts.
use_persons = unique(rbindlist(state_persons), by = 'personpk')

all_use_persons = unique(unlist(lapply(state_persons, function(x) x[, unique(personpk)]), use.names = FALSE))
all_use_regiments = roster[personpk %in% all_use_persons, unique(regimentpk)]


############################
#Get within-unit experience#
############################
#Roster rows for regiments of the states in union_states
roster_min = roster[regimentpk %in% unlist(state_regiments)]

#Person-level outcomes looked up in use_persons; disability comes from the
#roster's exit method
roster_min[, died := personpk %in% use_persons[survivedwar == 'N', personpk]]
roster_min[, wounded := personpk %in% use_persons[waswounded == 'Y', personpk]]
roster_min[, disabled := outmethodsimple %in% "Disabled"]

#Parse service dates (stored as POSIXct; unparseable values become NA)
roster_min[, indate := as.POSIXct(strptime(mustindate, "%Y-%m-%d"))]
roster_min[, outdate := as.POSIXct(strptime(outdate, "%Y-%m-%d"))]

#Put reversed spells in chronological order BEFORE measuring their length.
#The duration used to be computed first, so swapped spells carried a negative
#time_in into the later sums.
roster_min[indate > outdate, c('indate', 'outdate') := list(outdate, indate)]

#Length of the spell in days. `units` is named explicitly: the third
#positional argument of difftime() is `tz`, not `units`.
roster_min[, time_in := as.numeric(difftime(outdate, indate, units = 'days'), units = 'days')]

#Make company
#One id per (regiment, company letter); rows without a company stay NA
roster_min[unitcompany != "", c('companypk') := list(.GRP) , by = list(regimentpk, unitcompany)]

#Get usable sample
roster_min[, useTime := !is.na(indate) & !is.na(outdate)]
roster_min[, useCompany := !is.na(companypk)]

#Merge in county of residence
#pk[roster_min] keeps every roster row; soldiers missing from the crosswalk
#get NA state/county, so their state_county is the literal string "NA NA"
setkey(roster_min, personpk)
setkey(pk, personpk)
roster_min = pk[roster_min]
roster_min[, state_county := paste(state, county)]
rm(roster); gc()

#Get company casualty rate
##########################

#Companies with at least one row that has both service dates and a company id
u_companies = roster_min[(useTime) & (useCompany), companypk]
u_companies = na.omit(unique(u_companies))

#Keyed, NA-free service spells of a single company
company_intervals = function(company_id) {
  members = roster_min[(useTime) & (useCompany) & companypk %in% company_id,
                       list(personpk, companypk, died, disabled, indate, outdate)]
  setkey(members, companypk, indate, outdate)
  na.omit(members)
}

#One company at a time: overlap-join the company's spells with themselves and,
#for every soldier, summarise the overlapping spells of OTHER soldiers:
#  in_company_n     - number of overlapping spells
#  company_deaths   - those whose soldier died and whose spell ended earlier
#  company_disabled - those discharged as disabled whose spell ended earlier
c_list = lapply(u_companies, function(company_id) {
  overlaps = foverlaps(company_intervals(company_id),
                       company_intervals(company_id),
                       by.x = c('companypk', 'indate', 'outdate'),
                       by.y = c('companypk', 'indate', 'outdate'))
  overlaps[personpk != i.personpk,
           list(in_company_n = .N,
                company_deaths = sum(i.died & (i.outdate < outdate)),
                company_disabled = sum(i.disabled & (i.outdate < outdate))),
           by = list(personpk, companypk)]
})

company_casualty_rate = rbindlist(c_list, fill = TRUE)

rm(list = c('c_list', 'company_intervals'))
gc()


#Get out-of-county company casualty rate
########################################

#Same company universe as the within-company calculation
u_companies = roster_min[(useTime) & (useCompany), companypk]
u_companies = na.omit(unique(u_companies))

#Keyed, NA-free service spells of a single company, including the soldier's
#state_county label
company_intervals_ooc = function(company_id) {
  members = roster_min[(useTime) & (useCompany) & companypk %in% company_id,
                       list(personpk, companypk, died, disabled, indate, outdate, state_county)]
  setkey(members, companypk, indate, outdate)
  na.omit(members)
}

#As for the company casualty rate, but only overlapping spells of soldiers
#whose state_county differs from the focal soldier's are counted
c_list = lapply(u_companies, function(company_id) {
  overlaps = foverlaps(company_intervals_ooc(company_id),
                       company_intervals_ooc(company_id),
                       by.x = c('companypk', 'indate', 'outdate'),
                       by.y = c('companypk', 'indate', 'outdate'))
  overlaps[personpk != i.personpk & (state_county != i.state_county),
           list(in_company_n_ooc = .N,
                company_deaths_ooc = sum(i.died & (i.outdate < outdate)),
                company_disabled_ooc = sum(i.disabled & (i.outdate < outdate))),
           by = list(personpk, companypk)]
})

company_casualty_rate_ooc = rbindlist(c_list, fill = TRUE)

rm(list = c('c_list', 'company_intervals_ooc'))
gc()


#Get regimental casualty rate
#############################

#Work regiment by regiment to reduce memory use
u_regiments = na.omit(unique(roster_min$regimentpk))

#Keyed, NA-free service spells of a single regiment (rows with both dates)
regiment_intervals = function(regiment_id) {
  members = roster_min[(useTime) & regimentpk %in% regiment_id,
                       list(personpk, regimentpk, died, disabled, indate, outdate)]
  setkey(members, regimentpk, indate, outdate)
  na.omit(members)
}

#For every soldier, summarise the overlapping spells of OTHER soldiers in the
#same regiment: how many there are, and how many of them belong to a soldier
#who died / was discharged disabled and ended earlier than the focal spell
r_list = lapply(u_regiments, function(regiment_id) {
  overlaps = foverlaps(regiment_intervals(regiment_id),
                       regiment_intervals(regiment_id),
                       by.x = c('regimentpk', 'indate', 'outdate'),
                       by.y = c('regimentpk', 'indate', 'outdate'))
  overlaps[personpk != i.personpk,
           list(in_regiment_n = .N,
                deaths = sum(i.died & (i.outdate < outdate)),
                disabled = sum(i.disabled & (i.outdate < outdate))),
           by = list(personpk, regimentpk)]
})

regiment_casualty_rate = rbindlist(r_list, fill = TRUE)

rm(list = c('r_list', 'regiment_intervals'))
gc()

#Get alternate regimental casualty / combat data
################################################

#Get regimental casualty records
#Each record becomes a one-day interval (indate == outdate) so that it can be
#overlap-joined against service spells
regcas = fread(paste0(cwdb_path,'/regcas.csv'))[regimentpk %in% regiments$regimentpk]
regcas[, indate := strptime(date, "%Y-%m-%d")]
regcas[, outdate := indate]


a = roster_min[(useTime) & !is.na(regimentpk), list(personpk, regimentpk, indate, outdate)]
b = regcas[!is.na(indate) & !is.na(regimentpk), list(regimentpk, indate, outdate, killed, wounded, pow, missing)]

setkey(a, regimentpk, indate, outdate)
setkey(b, regimentpk, indate, outdate)

#foverlaps keeps a service spell that matches no casualty record as a single
#row whose record columns are all NA. Counting rows with .N therefore gave
#every soldier without recorded combat one "combat day". Counting only rows
#that carry a record date (indate is the record's date in the join result;
#the spell's own dates are i.indate / i.outdate) gives such soldiers 0.
#na.rm = TRUE stops a single record with a missing count from turning the
#soldier's whole regiment total into NA.
regiment_combat_data = foverlaps(a, b, by.x = c('regimentpk', 'indate', 'outdate'), by.y = c('regimentpk', 'indate', 'outdate')) %>%
  .[, list(combat_days = sum(!is.na(indate)),
           kia = sum(killed, na.rm = TRUE),
           wia = sum(wounded, na.rm = TRUE),
           pow = sum(pow, na.rm = TRUE),
           mia = sum(missing, na.rm = TRUE)), by = list(personpk, regimentpk)]

rm(list = c('a', 'b'))
gc()


#######################
#Combat alongside USCT#
#######################

#USCT combat data
#Regiments whose state code is "UC" are treated as USCT regiments
#NOTE(review): presumably "UC" is the database code for U.S. Colored Troops --
#confirm against the regiments table.
usct_regiments = regiments[state %in% "UC", regimentpk]
regcas_usct = fread(paste0(cwdb_path, '/regcas.csv'))[regimentpk %in% usct_regiments]
#Each casualty record becomes a one-day interval (indate == outdate)
regcas_usct[, indate := strptime(date, "%Y-%m-%d")]
regcas_usct[, outdate := indate]

#Number of USCT casualty records per (day, place); records without a parsed
#date or with an empty place are ignored
usct_combat = regcas_usct[!is.na(indate) & place != "", list(indate, outdate, place, usct_combat = 1, regimentpk)]
usct_combat = usct_combat[, list(usct_combat = sum(usct_combat)), by = list(indate, outdate, place)]

#Attach the USCT count to every regimental casualty record that shares the
#same day and place. All rows of regcas are kept by the join (usct_combat is
#NA where no USCT record matches); rows without a date or place are dropped.
setkey(regcas, indate, outdate, place)
setkey(usct_combat, indate, outdate, place)

regcas_usct_combat = usct_combat[regcas, allow.cartesian = T][!is.na(indate) & place != ""]

#Get personal USCT combat experience
#a: service spells; b: dated casualty records carrying the USCT count
a = roster_min[(useTime) & !is.na(regimentpk), list(personpk, regimentpk, indate, outdate)]
b = regcas_usct_combat[!is.na(indate) & !is.na(regimentpk), list(regimentpk, indate, outdate, usct_combat)]

setkey(a, regimentpk, indate, outdate)
setkey(b, regimentpk, indate, outdate)

#Per soldier and regiment, over the casualty records that fall inside the
#soldier's spell:
#  usct_combat_regiments - sum of the USCT record counts
#  usct_combat_days      - number of records with a positive USCT count
#na.rm = T makes spells without any matching record contribute 0
regiment_usct_combat_data = foverlaps(a, b, by.x = c('regimentpk', 'indate', 'outdate'), by.y = c('regimentpk', 'indate', 'outdate')) %>%
  .[, list(usct_combat_regiments = sum(usct_combat, na.rm = T),
           usct_combat_days = sum(usct_combat > 0, na.rm = T)), by = list(personpk, regimentpk)]

#regcas is released here as well; it is not used after this point
rm(list = c('a', 'b', 'regcas_usct_combat', 'usct_combat', 'regcas', 'regcas_usct'))
gc()


############################
#Time in Mixed Race Brigade#
############################

#Load regimental assignments
reg_assign = fread(paste0(cwdb_path, '/assignus.csv'))[regimentpk %in% c(usct_regiments, regiments$regimentpk)]
reg_assign = reg_assign[, list(regimentpk,fromdatel, fromyear, frommonth, todatel, toyear, tomonth, assignnum, brigade, division, corps, army, armycode, armyname)]

#Hierarchical unit labels: every level is prefixed with its parent levels
reg_assign[, u_corps := paste(army, corps, sep = ":")]
reg_assign[, u_division := paste(u_corps, division, sep = ":")]
reg_assign[, u_brigade := paste(u_division, brigade, sep = ":")]
reg_assign[, u_brigade_alt := paste(u_corps, brigade, sep = ":")]

#First day of the calendar month lying `offset` months after (yr, mo).
#Year boundaries are handled (December + 1 -> January of the next year) and a
#missing or invalid year/month yields NA instead of an as.Date() error.
month_start = function(yr, mo, offset = 0L) {
  yr = as.integer(yr)
  mo = as.integer(mo)
  mo[!(mo %in% 1:12)] = NA_integer_
  idx = yr * 12L + (mo - 1L) + as.integer(offset)
  as.Date(sprintf('%d-%02d-01', idx %/% 12L, idx %% 12L + 1L), format = '%Y-%m-%d')
}

#Clean dates
#Exact dates are used where they parse. A missing end date is imputed as the
#last day of the month before `tomonth`, a missing start date as the first
#day of `frommonth`. The *_flag columns mark which side was imputed.
reg_assign[, from_date := as.Date(strptime(fromdatel, format = '%Y-%m-%d'))]
reg_assign[, to_date := as.Date(strptime(todatel, format = '%Y-%m-%d'))]
reg_assign[!is.na(from_date) & is.na(to_date), to_date_flag := 1]
reg_assign[!is.na(from_date) & is.na(to_date), to_date := month_start(toyear, tomonth) - 1]

reg_assign[is.na(from_date) & !is.na(to_date), from_date_flag := 1]
reg_assign[is.na(from_date) & !is.na(to_date), from_date := month_start(fromyear, frommonth)]

from_to_idx = is.na(reg_assign$from_date) & is.na(reg_assign$to_date)
reg_assign[from_to_idx, from_to_date_flag := 1]
reg_assign[from_to_idx, from_date := month_start(fromyear, frommonth)]
reg_assign[from_to_idx, to_date := month_start(toyear, tomonth) - 1]

#Fix to_date imputation when in same month
#An assignment that starts and ends in the same month would otherwise end
#before it starts; use the last day of that month instead. The old
#`tomonth+1` produced month 13 for December.
reg_assign[to_date < from_date & !is.na(to_date_flag) & fromyear == toyear & frommonth==tomonth,
           to_date := month_start(toyear, tomonth, 1L) - 1]

reg_assign[to_date < from_date & !is.na(from_to_date_flag) & fromyear == toyear & frommonth==tomonth,
           to_date := month_start(toyear, tomonth, 1L) - 1]


#Expand each assignment to one row per day.
#NOTE(review): the strict `<` drops assignments whose from and to dates
#coincide, along with rows where either date is NA -- confirm this is intended.
reg_assign = reg_assign[from_date < to_date, list(date = seq.Date(from_date, to_date, by = 'day'),
                                                  army,
                                                  u_corps,
                                                  u_division,
                                                  u_brigade,
                                                  to_date_flag,
                                                  from_date_flag,
                                                  from_to_date_flag,
                                                  from_date,
                                                  to_date), by = list(regimentpk, assignnum)]

#Brigade assignments
#unit_assignments: daily brigade of every non-USCT regiment
#brigade_composition: per brigade and day, the number of assigned regiments
#and how many of them are USCT regiments
unit_assignments = reg_assign[regimentpk %in% setdiff(regiments$regimentpk, usct_regiments), list(regimentpk, date, u_brigade)]
all_assignments = reg_assign[, list(regimentpk_j = regimentpk, date, u_brigade)]
brigade_composition = all_assignments[, list(regiments = length(regimentpk_j), usct_regiments = sum(regimentpk_j %in% usct_regiments)), by = list(u_brigade, date)]

setkey(unit_assignments, u_brigade, date)
setkey(brigade_composition, u_brigade, date)

#Daily brigade composition for each regiment, as one-day intervals
brigade_match = brigade_composition[unit_assignments, allow.cartesian = TRUE]
brigade_match[, indate := strptime(as.character(date), "%Y-%m-%d")]
brigade_match[, outdate := indate]

#Get personal time in brigade assignments

#Loop over regiments to reduce memory use
u_regiments = roster_min$regimentpk %>% unique %>% na.omit
r_list = vector('list', length = length(u_regiments))

for (i in seq_along(u_regiments)) {
  r = u_regiments[i]
  a = roster_min[(useTime) & regimentpk %in% r, list(personpk, regimentpk, indate, outdate)]
  b = brigade_match[!is.na(indate) & regimentpk %in% r, list(regimentpk, indate, outdate, usct_regiments)]
  setkey(a, regimentpk, indate, outdate)
  setkey(b, regimentpk, indate, outdate)
  #A spell that overlaps no assignment day comes back from foverlaps as one
  #row with NA assignment columns. Counting rows (.N) or unique(indate)
  #credited such soldiers with one assignment day, so only rows that carry an
  #assignment date are counted (indate is the assignment day in the result).
  r_list[[i]] = foverlaps(a, b, by.x = c('regimentpk', 'indate', 'outdate'), by.y = c('regimentpk', 'indate', 'outdate')) %>%
    .[, list(usct_brigade_r = sum(usct_regiments, na.rm = TRUE),
             usct_brigade_any = sum(usct_regiments > 0, na.rm = TRUE),
             assign_days = sum(!is.na(indate)),
             u_assign_days = length(unique(indate[!is.na(indate)]))), by = list(personpk, regimentpk)]
}

regiment_usct_brigade_data = rbindlist(r_list, fill = TRUE)

rm(list = c('a', 'b', 'r_list', 'unit_assignments', 'brigade_composition', 'all_assignments', 'brigade_match', 'reg_assign', 'from_to_idx'))
gc()

#################################
#Merge experience to roster data#
#################################

#One row per soldier x company (x regiment x died flag): total days served and
#the earliest / latest service date, using only rows with valid dates and a
#company id
roster_company = roster_min[
  (useTime) & !is.na(companypk),
  list(time_in = sum(time_in),
       indate = min(indate),
       outdate = max(outdate)),
  by = list(personpk, companypk, regimentpk, died)
]

#Merge in company casualty rates
#Each X[roster_company] join keeps every roster_company row; soldiers missing
#from X get NA for X's columns
company_keys = c('personpk', 'companypk')

setkeyv(roster_company, company_keys)
setkeyv(company_casualty_rate, company_keys)
roster_company = company_casualty_rate[roster_company]

setkeyv(roster_company, company_keys)
setkeyv(company_casualty_rate_ooc, company_keys)
roster_company = company_casualty_rate_ooc[roster_company]

#merge in regiment-level data
#Collapse soldier x company rows to soldier x regiment.
#A soldier without any overlapping comrade in one company has NA company
#counts for that company after the merges above. Summing without na.rm turned
#the whole regiment total into NA, and the individual-level collapse (which
#does use na.rm) then replaced it with 0, discarding the soldier's valid
#counts from other companies of the same regiment. na.rm = TRUE keeps them
#and matches the NA handling of the individual-level collapse.
roster_regiment = roster_company[, list(
                                      time_in = sum(time_in),
                                      indate = min(indate),
                                      outdate = max(outdate),
                                      in_company_n = sum(in_company_n, na.rm = TRUE),
                                      company_deaths = sum(company_deaths, na.rm = TRUE),
                                      company_disabled = sum(company_disabled, na.rm = TRUE),
                                      in_company_n_ooc = sum(in_company_n_ooc, na.rm = TRUE),
                                      company_deaths_ooc = sum(company_deaths_ooc, na.rm = TRUE),
                                      company_disabled_ooc = sum(company_disabled_ooc, na.rm = TRUE)
                                    ), by = list(personpk, regimentpk)]

#Attach the four soldier x regiment experience tables to roster_regiment.
#Every join has the form X[roster_regiment], so all roster_regiment rows are
#kept and X's columns are NA where X has no row for that soldier and regiment.
regiment_keys = c('personpk', 'regimentpk')

setkeyv(regiment_combat_data, regiment_keys)
setkeyv(regiment_casualty_rate, regiment_keys)
setkeyv(regiment_usct_combat_data, regiment_keys)
setkeyv(regiment_usct_brigade_data, regiment_keys)

#regiment combat experience
setkeyv(roster_regiment, regiment_keys)
roster_regiment = regiment_combat_data[roster_regiment]

#regiment casualty rates
setkeyv(roster_regiment, regiment_keys)
roster_regiment = regiment_casualty_rate[roster_regiment]

#regiment usct combat experience
setkeyv(roster_regiment, regiment_keys)
roster_regiment = regiment_usct_combat_data[roster_regiment]

#regiment interracial brigades
setkeyv(roster_regiment, regiment_keys)
roster_regiment = regiment_usct_brigade_data[roster_regiment]

#The intermediate tables are no longer needed
rm(list = c('roster_company',
            'company_casualty_rate',
            'company_casualty_rate_ooc',
            'regiment_casualty_rate',
            'regiment_combat_data',
            'regiment_usct_combat_data',
            'regiment_usct_brigade_data'))
gc()

##############################
#Collapse to individual level#
##############################

#Experience columns that are summed (NAs ignored) across a soldier's
#regiments. The order fixes the column order of roster_use; `deaths` is
#renamed to regiment_deaths after the collapse.
sum_cols = c('in_company_n', 'company_deaths', 'company_disabled',
             'in_company_n_ooc', 'company_deaths_ooc', 'company_disabled_ooc',
             'deaths', 'in_regiment_n',
             'combat_days', 'kia', 'mia', 'wia', 'pow',
             'usct_combat_days', 'usct_combat_regiments',
             'u_assign_days', 'assign_days',
             'usct_brigade_any', 'usct_brigade_r')

#One row per soldier: first and last service date, total days served, and the
#summed experience columns
roster_use = roster_regiment[
  ,
  c(list(indate = min(indate),
         outdate = max(outdate),
         time_in = sum(time_in)),
    lapply(.SD, sum, na.rm = TRUE)),
  by = personpk,
  .SDcols = sum_cols
]
setnames(roster_use, 'deaths', 'regiment_deaths')

#Deaths among overlapping comrades divided by the number of overlapping
#comrades (NaN when the denominator is 0)
roster_use[, `:=`(
  company_casualty_rate = company_deaths / in_company_n,
  company_casualty_rate_ooc = company_deaths_ooc / in_company_n_ooc,
  regiment_casualty_rate = regiment_deaths / in_regiment_n
)]

#Attach state served and survival from the person table; after this join the
#first three columns of roster_use are personpk, stateserved, survivedwar
roster_state = use_persons[, list(personpk, stateserved, survivedwar)]
setkey(roster_use, personpk)
setkey(roster_state, personpk)

roster_use = roster_state[roster_use]
roster_use[, `:=`(
  use_state = stateserved %in% union_states,
  survived = survivedwar != "N"
)]

########################################
#Merge in individual wartime experience#
########################################

#Keep only the person fields needed for the county collapse
use_persons = use_persons[, list(personpk, survivedwar, birth_year_full)]

#Attach the roster-based experience measures. roster_use also has a
#survivedwar column, so the one coming from use_persons appears as
#i.survivedwar in the result.
setkey(use_persons, personpk)
setkey(roster_use, personpk)
use_persons = roster_use[use_persons]

#Merge in geographic location
#use_persons[pk] keeps every crosswalk row; soldiers absent from use_persons
#have NA in all person / experience columns
setkey(use_persons, personpk)
setkey(pk, personpk)
to_collapse = use_persons[pk]

#Calculate veteran counts
#veterans: crosswalk soldiers in the county; veterans_s: those not marked "N".
#NOTE(review): the sum has no na.rm, so one soldier with NA i.survivedwar makes
#veterans_s NA for the whole county -- confirm this is intended.
to_collapse[, `:=`(
  veterans = .N,
  veterans_s = sum(i.survivedwar != "N")
), by = list(state, county)]

#Calculate county mean experiences for survivors
#All roster_use columns after the first three (personpk, stateserved,
#survivedwar) are converted to numeric and averaged, ignoring NAs
experience_cols = names(roster_use)[-(1:3)]
county_veteran_data = to_collapse[
  survivedwar != "N",
  lapply(.SD, function(x) mean(as.numeric(x), na.rm = TRUE)),
  by = list(state, county, veterans, veterans_s),
  .SDcols = experience_cols
]

#Enlistment data to use:
#Rows with an empty state or county are not written
fwrite(county_veteran_data[state != "" & county != ""], './cleaned/agg_county_enlistment.csv')

#Cleanup
#Remove every object except those named in `keep` and `keep` itself
rm(list = setdiff(ls(), c(keep, 'keep')))
gc()